############################# Model Building ############################

################################################# set 'working Directory'
setwd("D:/AS17 translation/AS17 supplements")
opar <- par() 
#options(warn=-1)
#########################################################################
#                            Data / Sources
#         cholesterin.csv
#              psdata.csv
#    skin cancer data.csv
#      lme data broad.csv
#      clustered data.csv
#           teratogen.csv
#      toxicity study.csv
# Death rates 2015-17.xlsx
#########################################################################

                                                            # Section 8.1
x    <- rnorm(50, mean=10, sd=5)                         
stat <- function(x) quantile(x, probs=0.25); stat(x)
B    <- 1000; t <- rep(NA, B)                            # Bootstrap
for (i in 1:B) t[i] <- stat(sample(x, replace=TRUE))
E.stat <- mean(t); E.stat                                # estimator
V.stat <- var(t); V.stat                                 # variance

#########################################################################

x     <- rnorm(50, mean=10, sd=5)
theta <- function(x) quantile(x, probs=0.95); theta(x)
n     <- length(x)
t     <- rep(NA, n)
for  (i in 1:n) t[i] = theta(x[-i])                     # Jackknife
JE.theta <- mean(t); JE.theta                           # estimator
JV.theta <- ((n-1)/n) * sum((t-JE.theta)^2); JV.theta   # variance
JC.theta <- n*theta(x) - (n-1)*JE.theta; JC.theta       # correction

#########################################################################

x <- seq(0, 10, by = 0.5)
n <- length(x)                                        # linear regression
y <- 2 + 0.5*x; y <- y + rnorm(n, mean=0, sd=0.6)
resi <- lm(y~x)$residuals; sum(resi^2)              
err <- rep(NA, n)
for (i in 1:n) {                                       # crossvalidation
    mcf    <- lm(y[-i] ~ x[-i])$coef                   # coefficients
    y.hat  <- mcf[1] + mcf[2]*x[i]                     # estimator
    err[i] <- y[i] - y.hat  }                          # error
t.err <- sum(err^2); t.err

#########################################################################

                                                            # Section 8.2  

                                                      # linear regression
                                                 
#########################################################################

                                                          # Section 8.2.1

cholesterin <- read.table("cholesterin.CSV", sep=";",dec=",",header=T) 
# str(cholesterin)                                            # structure           
# edit(cholesterin)                                           # edit           
# summary(cholesterin)                                        # summary     
attach(cholesterin)                          
n    <- length(age)                   
m.x  <- mean(age);                        m.x
s.x  <- sd(age);                          s.x
m.y  <- mean(cholest);                    m.y
s.y  <- sd(cholest);                      s.y
                                # variances and covariance  
ss.xx <- sum((age - m.x)^2);                  ss.xx 
ss.yy <- sum((cholest  - m.y)^2);             ss.yy
ss.xy <- sum((age - m.x)*(cholest - m.y));    ss.xy
                                # linear regression (OLS)
beta1 <- ss.xy / ss.xx;                       beta1
beta0 <- m.y - beta1 * m.x;                   beta0

#########################################################################

                                                             # figure 8.1
par(lwd=2, font.axis=2, bty="n", ps=15, mfrow=c(1,1))  
plot(age, cholest, ylab="Cholesterol", xlab="age", las=1) # scatterplot       
abline(beta0, beta1, col="black")                         # regression line


#########################################################################

cholest.e <- beta0 + beta1*age; e <- cholest - cholest.e
                                                             # figure 8.2
par(lwd=2, font.axis=2, bty="n", ps=15, mfrow=c(1,2)) 
qqnorm(e, xlab = "Normal plot", ylab = "Residual", las=1,
                        ylim=c(-0.6,+0.6), main="" )
lines(c(-2, +2),c(-0.6, +0.6), lty=2)
plot(cholest.e, e, ylab="Residuals", las=1,
                  xlab="Cholesterol estimated", ylim=c(-0.6,+0.6))
abline(h=0, lty=2)

#########################################################################

lin.model <- lm(cholest ~ age)
summary(lin.model)

#########################################################################

anova(lin.model)

#########################################################################

par(lwd=2, font.axis=2, bty="n", ps=15, mfrow=c(1,1))  
plot(age, cholest, ylab="Cholesterol", xlab="age", las=1) # scatterplot       
abline(beta0, beta1, col="black")                         # regression line
chol.hat <- beta0 + beta1 * age                           # estimation  
for (i in 1:n)                                            # residuals
lines(c(age[i],age[i]), c(cholest[i],chol.hat[i]), col="red", lty=2)
resid <- cholest - chol.hat
                                                          # ANOVA       
ss.regression <- sum((chol.hat - m.y)^2);       ss.regression
ms.regression <- ss.regression
ss.residual   <- sum((chol.hat - cholest)^2);   ss.residual
ms.residual   <- ss.residual/(n-2)
F.ratio       <- ms.regression/ms.residual;     F.ratio

ss.regression + ss.residual; ss.yy                        # == s.yy                     

                                                # correlation coefficient          
r.cor <- sqrt(ss.regression/ss.yy); r.cor 
cor(age, cholest)

#########################################################################

                                         
library(MASS)                                          # linear modelling  
y <- cholest;  x <- age;   n <- length(x)

lin.model <- lm(cholest ~ age)
summary(lin.model)

r <- resid(lin.model)
data <- cbind(x, y, r); data
                                          # elementary in R 
X <- cbind(rep(1,n), x) 

SSx <- ginv(t(X)%*%X)                     # SAQ

beta <- SSx %*% t(X) %*% y; beta          # estimate of coefficients 

SSr <- t(r)%*%r; SSr                      # SAQ in residuals          

MSr <- SSr/(n-2); MSr                     # MSQ in residuals        

sqrt(SSr/(n-2))*sqrt(SSx[2,2])            # standard error in beta1     

sqrt(SSr/(n-2))*sqrt(SSx[1,1])            # standard error in beta0     

SSy <-  t(beta) %*% t(X) %*% y

MSy <- SSy - n*(mean(y)^2)                # MSQ in linear regression       

#########################################################################

                                                          # Section 8.2.2

                                             # multiple linear regression
library(DAAG)
data(litters)
attach(litters)
litters[1:10,]
                                                             # figure 8.4
par(mfrow=c(1,2), lwd=2, font.axis=2, bty="l", ps=15)
plot(bodywt, lsize, ylab="Litter size", las=1,
     xlab="Body weight", cex=1.3)
plot(brainwt, lsize, ylab="Litter size", las=1,
     xlab="Brain weight", cex=1.3)

########################################################################

par(mfrow=c(1,1), lwd=2, font.axis=2, bty="o", ps=15) 
pairs(litters, labels=c("Litter size","Body weight","Brain weight"))

########################################################################

y <- lsize; n <- length(y); 
X <- matrix(c(rep(1,20), bodywt, brainwt), nrow=20,
            dimnames=list(1:20, c("one", "bodywt", "brainwt"))); p <- 2
t(X) %*% X                           # matrix product                     
xtxi <- solve(t(X) %*% X); xtxi   

b.h <- xtxi %*% t(X) %*% y; b.h      # estimation of parameters           

H   <- X %*% xtxi %*% t(X)           # computation of hat matrix         
y.h <- H %*% y;                      # estimation of (x %*% b.h)   
e.h <- y - y.h;                      # estimation error - residuals          
cbind(y, y.h, e.h)[1:10,]

RSS <- t(e.h) %*% e.h; RSS           # sum of squared deviations
s.h <- sqrt(RSS/(n-p-1))             # estimation of standard deviation
se.b  <- sqrt(diag(xtxi))*s.h; se.b  # standard error of estimate    
R <- 1 - RSS/sum((y-mean(y))^2); R   # compute R

######################################################################

                                      # linear model function lm()
fit <- lm(lsize ~ bodywt + brainwt, data=litters)
summary(fit)

#########################################################################
                                         


#########################################################################

                                                          # Section 8.2.4

library(DAAG)                                # residuals in linear models                                                                    
data(litters)
fit <- lm(lsize ~ bodywt + brainwt, data=litters)
res.diag <- influence.measures(fit)               
round(cooks.distance(fit), 3)                # influencing observations 
infl <- as.numeric(which(apply(res.diag$is.inf,1,any))); infl  

                                                             # figure 8.7
par(lwd=2, font.axis=2, bty="n", ps=15, mfrow=c(1,1)) 
plot(litters$lsize ~ hatvalues(fit), ylim=c(0,12), las=1,
        ylab="Litter size", xlab = "Leverage")
points(hatvalues(fit)[infl], litters$lsize[infl], cex=2, pch=4)
                                                                                                

#########################################################################

                                                          # Section 8.2.5

                                                     # heteroskedasticity
# x <- rnorm(25, 1500, 300)                    
# v <- exp(x/1000) * rnorm(25, 1, 50)
# y <- 0.5*x + v 

library(car)
x <- c(1516, 1416, 1205, 1547,  829, 1669,  476, 1448, 1982,  978, 1567, 1401,
       1494, 2069, 1609, 1465, 1491, 1598, 1708, 1597, 1446, 1074, 1525, 2135, 1723)
y <- c(771,  868,  912,  728,  327,  959,  249,  700, 1650,  570, 1159,  517,  905,
       792, 1039, 830,  866,  993, 1241, 1164,  821,  776,  600, 1596,  578)
mod <- lm(y~x); summary(mod)                         # linear regression
v <- vcov(mod); sqrt(c(v[1,1], v[2,2]))              # covariance matriox 
v <- hccm(mod, type="hc0"); sqrt(c(v[1,1], v[2,2]))  # corrected according HC0
v <- hccm(mod, type="hc1"); sqrt(c(v[1,1], v[2,2]))  # corrected according HC1
v <- hccm(mod, type="hc2"); sqrt(c(v[1,1], v[2,2]))  # corrected according HC2
v <- hccm(mod, type="hc3"); sqrt(c(v[1,1], v[2,2]))  # corrected according HC3

#########################################################################

                                                             # figure 8.8
par(mfcol=c(1,2), lwd=2.5, font.axis=2, bty="l", ps=15)  
plot(x, y, xlab="X [income]", ylab="Y [expenditures]", 
     las=1, ylim=c(0,1800), yaxp=c(0,1800,9))
abline(mod, lty=2)
abline(a=-100, b=1.00, lty=3)
abline(a=-100, b=0.32, lty=3)
mod <- lm(y~x); res <- residuals(mod)
plot(x, res, xlab="X [income]", ylab="Residuals", 
     las=1, ylim=c(-500,500), yaxp=c(-500,+500,10))
abline(h=0, lty=2)
abline(a=-50, b=0.30, lty=3)
abline(a=50, b=-0.30, lty=3)

#########################################################################

                                                          # Section 8.2.6

library(DAAG)                           # lineares Modell - Funktion lm()
data(litters)
fit <- lm(lsize ~ bodywt + brainwt, data=litters)
RSS <- sum(fit$res^2)
SYY <- sum((litters$lsize - mean(litters$lsize))^2)
p   <- 2;   n <- 20
F   <- ((SYY-RSS)/p)/(RSS/(n-p-1)); F
p   <- 1-pf(F, p, n-p-1); p

#########################################################################

fit <- lm(lsize ~ bodywt + brainwt, data=litters); attach(litters)
p   <- 2;   n <- 20
x0  <- c(1.0, 8.0, 0.4)                   # single observation         
y0  <- sum(x0*fit$coef)                   # estimation of litter size     
X   <- cbind(1, bodywt, brainwt)          # construction of X matrix          
xtxi  <- solve(t(X) %*% X)                # variance matri               

t     <- qt(0.975, n-p-1)                 # quantile from  t-distribution     
sigma <- sqrt(sum(fit$res^2)/(n-p-1))     # estimation standard deviation
W     <- sqrt(1 + x0 %*% xtxi %*% x0)     # root term                   
round(c(y0-t*sigma*W, y0, y0+t*sigma*W), 2)

#########################################################################

                                                          # Section 8.2.7
                                                            
library(DAAG)                                        # variable selection
data(litters)
fit <- lm(lsize ~ ., data=litters)
drop1(fit, test="F")

fit <- lm(lsize ~ 1, data=litters)
add1(fit, ~ bodywt + brainwt, test="F")

#########################################################################
                                                              
library(DAAG)                                                 # step AIC 
data(litters)
fit <- lm(lsize ~ ., data=litters)
step(fit)                                                 

#########################################################################

                                                            # Section 8.3

                                   # analysis of variance in linear model

#########################################################################

                                                        # Section 8.3.1.1
# hemmhof <- read.csv("hemmhof.CSV",sep=";",dec=",") 
# hemmhof <- as.data.frame(hemmhof); attach(hemmhof);  summary(hemmhof)

antibiotic <- as.factor(c(rep("A",4), rep("B",5), rep("C",3))); antibiotic
value <- c(13.2,14.1,7.8,11.7,15.9,16.2,19.3,18.0,17.3,6.8,9.2,12.4); value
fit <- lm(value ~ antibiotic - 1)
summary(fit)
model.matrix(fit)
#########################################################################

summary(lm(value ~ antibiotic))

#########################################################################

                                                        # Section 8.3.1.2

                                                        # dummy coding
fit <- lm(value ~ antibiotic, contrasts = list(antibiotic="contr.treatment"))
summary(fit)

#########################################################################
                                                       
contr.treatment(3, base = 1, contrasts = TRUE)         # coding contrasts

summary(aov(value ~ antibiotic))

model.matrix(fit)

#########################################################################

                                                        # Section 8.3.1.3

                                                        # effect coding
fit <- lm(value ~ antibiotic, contrasts = list(antibiotic="contr.sum"))
summary(fit)
                                         
contr.sum(3, contrasts = TRUE)                          # coding contrast

model.matrix(fit)

#########################################################################

                                                        # Section 8.3.1.4
                                                  
anova(lm(value ~ antibiotic))                       # variance components

                              
library(multcomp)                    # simultaneous CI according to Tukey
d    <- data.frame(Gruppe = factor(antibiotic), value)
amod <- aov(value ~ antibiotic, data=d)
summary(glht(amod, linfct = mcp(antibiotic = "Tukey")))

                                                            # figure 8.9
par(mfrow=c(1,1), lwd=2, font.axis=2, bty="n", ps=15) 
plot(glht(amod, linfct = mcp(antibiotic = "Tukey")), 
     main="Simultaneous CI according to Tukey",  
     xlab="Inhibition zone (95%-KI)")

#########################################################################

                                                          # Section 8.3.2
                                                        
                                        # two-factor analysis of variance
# hemmhof <- read.csv("hemmhof2.CSV",sep=";",dec=",") 
# hemmhof <- as.data.frame(hemmhof); attach(hemmhof)  

antibiotic <- as.factor(c(rep("A",4), rep("B",5), rep("C",3),
                      rep("A",3), rep("B",4), rep("C",4)))
concentrat <- as.factor(c(rep("low",12), rep("high",11)))
value      <- c(13.2,14.1,7.8,11.7,15.9,16.2,19.3,18.0,17.3,6.8,9.2,12.4,
             10.4,12.6,6.3,11.5,13.7,10.9,15.1,12.3,14.5,16.7,10.3)

fit <- lm(value ~ antibiotic + concentrat, 
          contrasts=list(antibiotic="contr.sum", concentrat="contr.sum"))         
summary(fit) 

anova(fit)

model.matrix(fit)

#########################################################################

                                                            # figure 8.10
par(mfrow=c(1,1), lwd=2, font.axis=2, bty="l", ps=15) 
interaction.plot(antibiotic, concentrat, value, las=1, lwd=3)

#########################################################################

fit1 <- update(fit, . ~ . + antibiotic:concentrat) # test for interaction       
anova(fit, fit1)


fit <- lm(value ~ antibiotic + concentrat + antibiotic:concentrat)         
anova(fit)

#########################################################################

                                                            # Section 8.4

                                                    # logistic regression
                                                   
p        <- seq(0, 1, 0.02)                         # logit transform  
logit.p  <- log(p/(1-p))
x        <- seq(-5, +5, 0.2)
ilogit.x <- 1 / (1+exp(-x))
                                                            # figure 8.11
par(mfrow=c(1,2), lwd=2, font.axis=2, bty="l", ps=14) 
plot(p, logit.p, type="b", xlim=c(0,1), ylim=c(-4, +4), las=1,
     xlab= expression(paste("Probability  ", pi)), 
     ylab= expression(log(pi / (1-pi))))
plot(x, ilogit.x, type="b", xlim=c(-5,+5), ylim=c(0,1), las=1,
     xlab= expression(paste("Value x")), 
     ylab= expression(paste("Probability  ", 1/(1 + exp(-x)))))

#########################################################################

                                                # example Challenger data
t <- c(66,67,68,70,72,75,76,79,53,58,70,75,67,67,69,70,73,76,78,81,57,63,70)
d <- c( 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1 ,1)
fit <- glm(d ~ t, family=binomial)
summary(fit)

#########################################################################

fit$coef
temp <- seq(30,90,by=2)
lin  <- fit$coef[1] + fit$coef[2]*temp
prob <- exp(lin) / (1 + exp(lin))
                                                           
par(mfrow=c(1,2), lwd=2, font.axis=2, bty="l", ps=15)       # figure 8.12
boxplot(t ~ d, ylab="Temperature (F)", xlab="Failure")
plot(temp, prob, type="b", xlab="Temperature (F)",
                      ylab="Probability of failure")

#########################################################################

                                                          # Section 8.4.1
anova(fit, test="Chi")

#########################################################################

                                                          # Section 8.4.2

                                           # multiple logistic regression
                                          
library(rpart)                                    # example Kyphosis data 
attach(kyphosis)

                                                            
par(mfrow=c(1,3), lwd=2, font.axis=2, bty="l", ps=15)       # figure 8.13
boxplot(Age~Kyphosis, ylab="Age", xlab="Kyhosis", main="A")
boxplot(Number~Kyphosis, ylab="Number", xlab="Kyphosis", main="B")
boxplot(Start~Kyphosis, ylab="Start", xlab="Kyphosis", main="C")

#########################################################################

library(rpart)                                    # example Kyphosis data 
attach(kyphosis)
fit <- glm(Kyphosis ~ Age + Number + Start, family="binomial", data=kyphosis)
summary(fit)

#########################################################################

anova(fit, test="Chi")
fit1 <- update(fit, . ~ .- Age)
anova(fit, fit1, test="Chi")

#########################################################################

fit <- glm(Kyphosis ~ Age + Number + Start, 
                                family="binomial", data=kyphosis)
new.d <- data.frame(Age=c(12,24,60), Number=c(2, 4, 6), Start=c(15, 10, 5))
new.p <- round(predict(fit, new.d, type = "response"), 4)
cbind(new.d, new.p)

#########################################################################

library(rpart); library(rms)
data(kyphosis); df <- kyphosis
ddist <- datadist(df); options(datadist='ddist')
fit <- lrm(Kyphosis ~ Age + Number + Start, data=df, x=TRUE, y=TRUE)

par(mfrow=c(1,1), lwd=2, font.axis=2, bty="l", ps=15)       # figure 8.14
nom <- nomogram(fit, fun=function(x)1/(1+exp(-x)), 
                fun.at=c(.01,.05,seq(.1,.9,by=.1),.95,.99),
                lp=FALSE, funlabel="Risk for Kyphosis")
plot(nom, xfrac=.45)

#print(nom)

#########################################################################

                                                          # Section 8.4.4

                                                    
library(rpart)                                      # stepwise regression
library(MASS)

attach(kyphosis)
model <- glm(Kyphosis ~ 1,  family = binomial, data = kyphosis); model

model.step <- stepAIC(model, Kyphosis ~ Age + Number + Start, 
                                         trace = FALSE, direction="both")
model.step$anova

#########################################################################

                                                          # Section 8.4.5

                                                  # analysis of residuals
library(rpart)
attach(kyphosis)
fit <- glm(Kyphosis ~ Age + Number + Start, 
                                 family="binomial", data=kyphosis)
deviance.resid <- residuals(fit)
pearson.resid  <- residuals(fit, type="pearson")
hats <- influence(fit)$hat
idev <- deviance.resid^2 + pearson.resid^2 * hats/(1-hats)
                                                            
par(mfrow=c(1,3), lwd=2, font.axis=2, bty="l", ps=15)       # figure 8.15
plot(deviance.resid, las=1, xlab="Observation", ylab="Deviance residuals")
plot(pearson.resid, las=1, xlab="Observation", ylab="Pearson residuals")
plot(idev, las=1, xlab="Observation", ylab="Influence points)", type="b")

#########################################################################

                                                          # Section 8.4.7

library(rpart)                         # quality of classifivcation - AUC
attach(kyphosis)
fit <- glm(Kyphosis ~ Age + Number + Start, family="binomial", data=kyphosis)
x   <- ifelse(kyphosis$Kyphosis=="present", 1, 0)
p.h <- fit$fitted.values

t   <- seq(0, 1, length=100)
rp  <- sapply(t, function(tcut)        # true positive assignment rp
  { sum(p.h >= tcut & x == 1) / sum(x == 1) } )
t   <- seq(0, 1, length=100)
fp  <- sapply(t, function(tcut)        # false positive assignments fp
  { sum(p.h >= tcut & x == 0) / sum(x == 0) } )

AUC <- 0
for (i in 1:(length(fp)-1)) {
  step <- abs(fp[i+1] - fp[i]); AUC <- AUC + 0.5*step*(rp[i+1]+rp[i]) }
AUC
                                                            # figure 8.16
par(mfrow=c(1,1), lwd=2, font.axis=2, bty="o", ps=15)
plot(fp, rp, type="b", las=1, pch=16, col="blue",
     xlab="1 - specifity", ylab="sensitivity")
abline(a=0, b=1, col="red")
text(0.6, 0.3, paste("AUC=",round(AUC, 4)), cex=1.5)

#########################################################################

library(ROCR)                          # performance() - function in ROCR
pred <- prediction(p.h, x)
as.numeric(performance(pred, measure="auc")@y.values)

#########################################################################

                                                          # Section 8.4.8

psd <- read.csv2("psdata.csv")                # propensity score matching
psd[1:4,]

by(psd$age, psd$treat, summary)
t1 <- wilcox.test(age ~ treat, data=psd); t1
p1 <- round(t1$p.value, 5)

tab <- table(psd$sex, psd$treat); tab
t2 <- chisq.test(tab); t2
p2 <- round(t2$p.value, 5)

by(psd$bmi, psd$treat, summary)
t3 <- wilcox.test(bmi ~ treat, data=psd); t3
p3 <- round(t3$p.value, 5)
                                                           # figure 8.17
par(mfrow=c(1,3), lwd=2, bty="l", font=1, font.axis=3, 
    font.lab=1, ps=16, cex.lab=1.2, oma = c(0, 0, 2, 0))
boxplot(age ~ treat, data=psd, col="grey", las=1,
        names = c("not treated","treated"), xlab=" ",
        ylim=c(18,50), main=paste("age (p = ", p1, ")"))
text(x= 1, y= 18, labels= "n = 1000", cex=1.3)
text(x= 2, y= 18, labels= "n = 100", cex=1.3)
mosaicplot(t(tab), col=c("lightgrey","darkgrey"), 
           cex.axis=1.3, main=paste("gender (p =",p2,")"))
boxplot(bmi ~ treat, data=psd, col="grey", las=1,
        names = c("not treated","treated"), xlab=" ",
        main=paste("body mass index (p = ", p3, ")"))
mtext("- A -", side=3, outer = TRUE, cex = 1.3)

#########################################################################

library(Matching)                          # propensity score 
mod  <- glm(treat ~ age + sex + bmi, family=binomial, data=psd)

psd$yes <- predict(mod, type = "response") # P(treated)
psd$no  <- 1 - psd$yes                     # P(not treated)
                                           # Matching (1:1)
listMatch <- Match(Tr = (psd$treat == 1), 
                   X = log(psd$yes/psd$no), # matching scale
                   M = 1,                   # 1:1 matching
                   caliper  = 0.5,          # 0.5 * SD(X)
                   replace  = FALSE,
                   ties     = TRUE,
                   version  = "fast")
summary(listMatch)

treat <- psd[listMatch$index.treated,]
cntr  <- psd[listMatch$index.control,]
result   <- rbind(treat, cntr)

#########################################################################

                                                    # balance of matching
MatchBalance(treat ~ age + sex + bmi, data=psd, 
             match.out=listMatch, nboots=500)
# write.csv2(result, "psmatched.csv")

#########################################################################

by(result$age, result$treat, summary)
t1 <- wilcox.test(age ~ treat, data=result); t1
p1 <- round(t1$p.value, 5)

tab <- table(result$sex, result$treat); tab
t2 <- chisq.test(tab); t2
p2 <- round(t2$p.value, 5)

by(result$bmi, result$treat, summary)
t3 <- wilcox.test(bmi ~ treat, data=result); t3
p3 <- round(t3$p.value, 5)
                                                            # figure 8.17
par(mfrow=c(1,3), lwd=2, bty="l", font=1, font.axis=3, 
    font.lab=1, ps=15, cex.lab=1.2, oma = c(0, 0, 2, 0))
boxplot(age ~ treat, data=result, col="grey", las=1,
        names = c("not treated","treated"), xlab=" ",
        ylim=c(18,45), main=paste("age (p = ", p1, ")"))
text(x= 1, y= 18, labels= "n = 100", cex=1.3)
text(x= 2, y= 18, labels= "n = 100", cex=1.3)
mosaicplot(t(tab), col=c("lightgrey","darkgrey"), 
           cex.axis=1.3, main=paste("gender (p =",p2,")"))
boxplot(bmi ~ treat, data=result, col="grey", las=1,
        names = c("not treated","treated"), xlab=" ",
        main=paste("body mass index (p = ", p3, ")"))
mtext("- B -", side=3, outer = TRUE, cex = 1.3)

#########################################################################

                                                            # Section 8.5

                                   # Poisson regression, log linear model

                                                          # Section 8.5.1

                                          # example - mating of elephants
age     <- c(27,  28,  28,  28,  28,  29,  29,  29,  29,  29,  29,
             30,  32,  33,  33,  33,  33,  33,  34,  34,  34,  34,
             36,  36,  37,  37,  37,  38,  39,  41,  42,  43,  43,
             43,  43,  43,  44,  45,  47,  48,  52)
matings <- c(0,   1,   1,   1,   3,   0,   0,   0,   2,   2,   2,
             1,   2,   4,   3,   3,   3,   2,   1,   1,   2,   3,
             5,   6,   1,   1,   6,   2,   1,   3,   4,   0,   2,
             3,   4,   9,   3,   5,   7,   2,   9)

elephants <- as.data.frame(cbind(age, matings))
linear.mod <- lm(matings ~ age, data=elephants)     # simple linear model
summary(linear.mod)

                                                    # Poisson regression 
poiss.mod <- glm(matings ~ age, family=poisson, data=elephants)
summary(poiss.mod)
                                                              
residuals(poiss.mod, type="pearson")                          # residuals
new  <- data.frame(age = seq(27, 52, 1))
linp <- predict(linear.mod, new)
poip <- predict(poiss.mod, new, type = "response")

                                                            # figure 8.18
par(lwd=2, font.axis=2, bty="n", ps=15, mfrow=c(1,2))   
plot(age, matings, xlab="Alter", ylab="Successful matings",
     las=1, xlim = c(25, 55), ylim = c(0, 10))
lines(new$age, linp, col="grey", lty=2)
lines(new$age, poip, col="blue")
plot(age, residuals(poiss.mod), xlab="Age", ylab="Residuals",
     las=1, xlim = c(25, 55), ylim = c(-3, +3))
abline(h=0, col="grey", lty=2)

#########################################################################

                                                       # dispersion index
DP <- sum(residuals(poiss.mod, type="pearson")^2)/poiss.mod$df.res; DP
summary(poiss.mod, dispersion=DP)

#########################################################################

                                                          # Section 8.5.2

                                    # Poisson regression skin cancer risk
rates <- read.csv("skin cancer data.csv", sep = ";", dec = ",")
rates$city <- relevel(as.factor(rates$city), ref="Minneapolis")
mod   <- glm(cases ~ as.factor(rates$age) + city + offset(log(popul)) - 1,
           family="poisson", data=rates); summary(mod)

                                         
alpha <- c(mod$coefficients[1:8])        # coefficients
delta <- mod$coefficients[9]; rr <- exp(delta)
estim1 <- exp(alpha)*1000                # incidence rates Mineapolis
estim2 <- exp(alpha + delta)*1000        # incidence rates Dallas

city1  <- rates[rates$city=="Minneapolis",]
city2  <- rates[rates$city=="Dallas",]
resid1 <- city1$incid - estim1; resid2 <- city2$incid - estim2

par(lwd=1.5, font.axis=2, bty="n", ps=15, mfrow=c(1,1))  
plot(city2$age, city2$incid, type="l", las=1, lty=1, pch=1, cex=1.2,
     xlab="Age", ylab="Incidence rate",
     ylim=c(0.007, 10), log="xy")
abline(h=c(0.01,0.05,0.10,0.5,1.0,5.0,10.0), lty=3, col="grey")
points(city2$age, estim2, pch=16, cex=2, col="red")
lines(city1$age, city1$incid, type="l", lty=2, pch=2, cex=1.2)
points(city1$age, estim1, pch=17, cex=2, col="red")

#########################################################################

                                                          # Section 8.5.3

                                             # contingency table analysis

                                                            # figure 8.19
par(mfrow=c(1,2), lwd=2, font.axis=3, bty="l", ps=15) 
y <- c(86, 19, 18, 170, 43, 20, 40, 11, 5, 28, 4, 3)
tab1 <- matrix(y, byrow=TRUE, nrow = 4)                       # example A 
dimnames(tab1) <- list(Education=c("K","L","F","H"), 
                Duration=c("short","medium","long"))
mosaicplot(tab1, col=c("gray"), main = "Unemployment")

y <- c(911, 44, 538, 456, 3, 2, 43, 279)
tab2 <- array(y, dim = c(2,2,2))                              # example B 
dimnames(tab2) <- list(Cigarettes=c("yes","no"),
                Marijuana=c("yes","no"), Alcohol=c("Alcoh.yes","Alcoh.no"))
mosaicplot(tab2, col="gray", off=c(5,5,5), main="Drug abuse")

#########################################################################

y <- c(86, 19, 18, 170, 43, 20, 40, 11, 5, 28, 4, 3)
n <- sum(y)
tab <- matrix(y, byrow=TRUE, nrow = 4)     # table example A       
dimnames(tab) <- list(Education=c("K","L","F","H"), Duration=c("s","m","l"))
tab
zeit.sum <- apply(tab, 2, sum)             # marginal sums                  
ausb.sum <- apply(tab, 1, sum)   
L.sat <- -2*sum(y*log(y/n)); L.sat         # saturated model          
L.c <- c(0)
for (i in 1:4) { for (j in 1:3) {          # fixed marginla sums            
    L.c <- L.c + tab[i, j] * log(ausb.sum[i]*zeit.sum[j]/n^2)  }}
    L.c <- -2*L.c;   L.c                   # restricted model         
devianz <- L.c - L.sat; devianz
1-pchisq(devianz, 6)

#########################################################################

chisq.test(tab)

#########################################################################
  
                                                          # Section 8.5.4
                                                        
                                                        # loglinear model
y <- c(86, 19, 18, 170, 43, 20, 40, 11, 5, 28, 4, 3)
education <- c(rep("K",3), rep("L",3), rep("F",3),rep("H",3))
time      <- rep(c("k","m","l"),4)
tab       <- data.frame(education, time, y)
fit.sat <- glm(y ~ time + education + time:education, family=poisson, data=tab)
fit.c   <- update(fit.sat, . ~ . - time:education)
anova(fit.sat, fit.c)

#########################################################################

y <- c(86, 19, 18, 170, 43, 20, 40, 11, 5, 28, 4, 3)
education  <- c(rep("K",3), rep("L",3), rep("F",3), rep("H",3))
time       <- rep(c("k","m","l"),4)
tab     <- data.frame(education, time, y)
fit     <- glm(y ~ time*education, family=poisson, data=tab)
summary(fit)

#########################################################################

                                                          # Section 8.5.5
                                                        
                                    
library(MASS)                       # three-dimensional contingency table
y <- c(911, 538, 44, 456, 3, 43, 2, 279)
marihuana <- rep(c("ja","nein"), 4)
cigarette <- rep(c("ja","ja","nein","nein"),2)
alcohol   <- c(rep("ja",4), rep("nein",4))
tab       <- data.frame(marihuana, cigarette, alcohol, y)

model <- glm(y ~ marihuana + cigarette + alcohol,  
                                  family = poisson, data = tab); model

#########################################################################

                                                        # Section 8.5.5.2

                                                     # loglinear approach
val        <- matrix(nrow=8, ncol=9); stats      <- matrix(nrow=9, ncol=4)

fit.a      <- glm(y ~ marihuana + cigarette + alcohol, family=poisson, data=tab)
val[,1]    <- round(fitted.values(fit.a), 1)
stats[1,1] <- round(fit.a$deviance, 1); stats[1,2] <- round(fit.a$aic, 1)
stats[1,3] <- fit.a$df.residual

fit.b1 <- update(fit.a, ~ . + alcohol:cigarette, family=poisson, data=tab)
val[,2] <- round(fitted.values(fit.b1), 1)
stats[2,1] <- round(fit.b1$deviance, 1); stats[2,2] <- round(fit.b1$aic, 1)
stats[2,3] <- fit.b1$df.residual

fit.b2 <- update(fit.a, ~ . + alcohol:marihuana, family=poisson, data=tab)
val[,3] <- round(fitted.values(fit.b2), 1)
stats[3,1] <- round(fit.b2$deviance, 1); stats[3,2] <- round(fit.b2$aic, 1)
stats[3,3] <- fit.b2$df.residual

fit.b3 <- update(fit.a, ~ . + cigarette:marihuana, family=poisson, data=tab)
val[,4] <- round(fitted.values(fit.b3), 1)
stats[4,1] <- round(fit.b3$deviance, 1); stats[4,2] <- round(fit.b3$aic, 1)
stats[4,3] <- fit.b3$df.residual

fit.c1 <- update(fit.a, ~ . + alcohol:marihuana + cigarette:marihuana , 
                 family=poisson, data=tab)
val[,5] <- round(fitted.values(fit.c1), 1)
stats[5,1] <- round(fit.c1$deviance, 1); stats[5,2] <- round(fit.c1$aic, 1)
stats[5,3] <- fit.c1$df.residual

fit.c2 <- update(fit.a, ~ . + alcohol:marihuana + alcohol:cigarette , 
                 family=poisson, data=tab)
val[,6] <- round(fitted.values(fit.c2), 1)
stats[6,1] <- round(fit.c2$deviance, 1); stats[6,2] <- round(fit.c2$aic, 1)
stats[6,3] <- fit.c2$df.residual

fit.c3 <- update(fit.a, ~ . + alcohol:cigarette + cigarette:marihuana , 
                 family=poisson, data=tab)
val[,7] <- round(fitted.values(fit.c3), 1)
stats[7,1] <- round(fit.c3$deviance, 1); stats[7,2] <- round(fit.c3$aic, 1)
stats[7,3] <- fit.c3$df.residual

fit.d <- update(fit.a, ~ . + alcohol:cigarette + cigarette:marihuana + alcohol:marihuana, 
                family=poisson, data=tab)
val[,8] <- round(fitted.values(fit.d), 1)
stats[8,1] <- round(fit.d$deviance, 1); stats[8,2] <- round(fit.d$aic, 1)
stats[8,3] <- fit.d$df.residual

fit.0 <- glm(y ~ marihuana*cigarette*alcohol, family=poisson, data=tab)
val[,9] <- round(fitted.values(fit.0), 1)
stats[9,1] <- round(fit.0$deviance, 1); stats[9,2] <- round(fit.0$aic, 1)
stats[9,3] <- fit.0$df.residual

for (i in 1:9) {stats[i,4] <- round(pchisq(stats[i,1], 
                stats[i,3], lower.tail = FALSE), 10)}
dimnames(stats) <- list(Modell=c("A","B1","B2","B3","C1","C2","C3","D","0"),
                Statistik=c("Deviance","AIC","DF","P-value"))
stats

#########################################################################
                                                       
chi <- rep(NA, 9)                                          # Tabelle 8.8
for (i in 1:8) {chi[i] <- sum((val[,9] - val[,i])^2 / val[,i])}
val <- rbind(val, chi)
dimnames(val) <- list(Field=c(1,2,3,4,5,6,7,8,"chi"), 
                   Modell=c("A","B1","B2","B3","C1","C2","C3","D","0"))
val

#########################################################################

library(MASS)                     # variable selection in loglinear model
y <- c(911, 538, 44, 456, 3, 43, 2, 279)
marihuana <- rep(c("ja","nein"), 4)
cigarette <- rep(c("ja","ja","nein","nein"),2)
alcohol   <- c(rep("ja",4), rep("nein",4))
tab       <- data.frame(marihuana, cigarette, alcohol, y)

model.step <- stepAIC(model, list(upper = ~ .^3, 
                      lower = formula(model)), trace=FALSE)
model.step$anova

#########################################################################

                                           # interpretation of parameters
y <- c(911, 538, 44, 456, 3, 43, 2, 279)
m <- rep(c("ja","nein"), 4)
c <- rep(c("ja","ja","nein","nein"),2)
a   <- c(rep("ja",4), rep("nein",4))
tab <- data.frame(m, c, a, y)

fit <-glm(y ~ m * a * c - m:c:a, family=poisson, data=tab, x=T)
summary(fit)

# fit$x

#########################################################################

                                                            # Section 8.6

                                                  # repeated measurements 

                                                  # example - wide format  
wide <-read.table("lme data broad.csv", header=T, sep=";")
                                                  # example - long format    
long <- reshape(wide, direction="long", idvar = "case", timevar = "time",
                          v.names="y", varying=list(3:6))
str(long); attach(long)

                                                            # figure 8.21
par(lwd=2, font.axis=2, bty="l", ps=15, mfrow=c(1,1))  
interaction.plot(long$time, long$case, long$y, las=1, 
            ylab = "Dependent variable (y)",lwd=2, xlab = "Time")
                                                 
#########################################################################

                                                          # Section 8.6.1
                                 
                            # ANOVA repeated measurements - without group
long.aov1 <- aov(long$y ~ as.factor(long$time) + Error(factor(long$case)))
summary(long.aov1)

                            # ANOVA repeated measurements - with group   
long.aov2 <- aov(long$y ~ factor(long$group) * factor(long$time) 
                 + Error(factor(case)))
summary(long.aov2)
                           
g1 <- long[group==1,]; g2 <- long[group==2,] 
                                                            # figure 8.22
par(lwd=2, font.axis=2, bty="l", ps=15, mfrow=c(1,2))  
interaction.plot(g1$time, g1$case, g1$y, ylab = "Dependent variable (y)",
               ylim=c(10,40), las=1, lwd=2, main="Group 1", xlab = "Time")
interaction.plot(g2$time, g2$case, g2$y, ylab = "Dependent variable (y)",
               ylim=c(10,40), las=1, lwd=2, main="Group 2", xlab = "Time")

#########################################################################

                                                          # Section 8.6.2

                                                     # linear mixed model

                                      # linear regression in single cases
int  <- by(long, case, 
        function(data) coefficients(lm(y ~ time, data = data))[[1]])    
rate <- by(long, case, 
        function(data) coefficients(lm(y ~ time, data = data))[[2]])     
r <- as.data.frame(cbind(Case=wide$case, Group=wide$group, 
                         unlist(int), unlist(rate))); r
                                                         
                                                           # lattice plot
library(lattice); par(cex=2)
                                                           # figure 8.23
xyplot(y ~ time | case, data=long, aspect=1/2,
       xlab="Time", ylab="Dependent variable (y)", 
       lattice.options = list(lwd=3),
       strip = strip.custom(strip.names = TRUE, strip.levels = TRUE),
       panel = function(x, y, subscripts){
               panel.xyplot(x, y)
               panel.lmline(x, y)}, as.table=T)      

#########################################################################

library(nlme)                                              # grouped data 
long.new <-                   
  groupedData( y ~ time | case, data = long, outer = ~ as.factor(group),
          labels = list( x = "Time", y = "Dependent variable (y)" ))
long.lis <- lmList(y ~ time | case, data=long.new); long.lis
              
plot(long.new, outer = ~group)                # trellis plot by Subject  

predict(long.lis, se.fit = TRUE)

lme(long.lis)

#########################################################################

                                                # model 1 (without group)
mod1.lme <- lme( y ~ time , method="REML", data = long, random = ~ time | case)
summary(mod1.lme)

par(lwd=1.5, font.axis=2, bty="l", ps=12, mfrow=c(1,1)) 
plot(mod1.lme, fitted(.) ~ time | case, aspect=1/2, type="b")

par(lwd=1.5, font.axis=2, bty="l", ps=12, mfrow=c(1,1)) 
plot(residuals(mod1.lme))
abline(h=0, lty=2)

coef(mod1.lme)
resid(mod1.lme); fitted(mod1.lme)
par(lwd=1.5, font.axis=2, bty="l", ps=12, mfrow=c(1,1)) 
plot(long$y, resid(mod1.lme))
abline(h=0, lty=2)

fixed.effects(mod1.lme); random.effects(mod1.lme)

#########################################################################

                                                   # model 2 (with group) 
mod2.lme <- lme( y ~ time*as.factor(group) , method="REML", 
                                  data = long, random = ~ time | case)
summary(mod2.lme)

par(lwd=1.5, font.axis=2, bty="l", ps=12, mfrow=c(1,1)) 
plot(mod2.lme, fitted(.) ~ time | case, aspect=1/2, type="b",
     strip = strip.custom(strip.names = TRUE, strip.levels = TRUE),
     ylab="Dependent variable (esimated)", xlab="Time")

par(lwd=1.5, font.axis=2, bty="l", ps=12, mfrow=c(1,1)) 
plot(residuals(mod2.lme)); abline(h=0, lty=2)

coef(mod2.lme)
resid(mod2.lme); fitted(mod2.lme)
par(lwd=1.5, font.axis=2, bty="l", ps=12, mfrow=c(1,1)) 
plot(long$y, resid(mod2.lme)); abline(h=0, lty=2)

fixed.effects(mod2.lme); random.effects(mod2.lme)
                
#########################################################################

                                                 # plot residuals model 2
                                               
                                                            # figure 8.24
par(lwd=2, font.axis=2, bty="l", ps=15, mfrow=c(1,2)) 
fit <- unlist(fitted.values(mod2.lme))
interaction.plot(long$time, long$group, fit, xlab="Time", lwd=2, las=1,
                           ylab="Dependent variable (y)", lty=1:2)
plot(residuals(mod2.lme), las=1, ylab="Residual", xlab="Observation")
abline(h=0, lty=2)

#########################################################################

                                                       # comparing models
mod2.lme  <- lme( y ~ time * as.factor(group) , method="ML", 
                  data = long, random = ~ time | case)
mod2a.lme <- lme( y ~ time * as.factor(group) , method="ML", 
                  data = long, random = ~ 1 | case)                                     
anova(mod2.lme, mod2a.lme)                                                 


#########################################################################

                                                          # Section 8.6.3
                                                        
                                             # analysis of clustered data

                                    # random sampling from clustered data
cluster.smpl <- function(dfm, id, k = 1) {          # dfm data frame
                                                    # id identifier
                                                    # k number of cases
do.call("rbind", lapply(split(dfm, dfm[,id]), 
        function(x) { x[sample(seq_len(nrow(x)), size=k), ] } ))  }

dat <- read.csv("clustered data.csv", header = TRUE, dec=",", sep = ";")
cluster.smpl(dat, 2, 1)
                         
sort.data <- function(d, col) {                     # swort data      
    sd <- do.call("order", as.data.frame(d[ ,col]))
    d[sd, ]                 }

#########################################################################
                                               
                                        # example body height in families                          
height <- read.csv("clustered data.csv", header = TRUE, dec=",", sep = ";")
height <- sort.data(height, 2)          # sort by family                   

#########################################################################

                                                  # example data ('wide')               
wheight <- reshape(height, v.names=c("gender", "height"), 
                   idvar="family", timevar="case", direction="wide")
wheight
attach(wheight)                                   # correlations
cor(cbind(height.1, height.2, height.3, height.4, height.5))

#########################################################################
                                   
                                            # analysis of variance -  ICC
var.tab <- summary(aov(height ~ as.factor(family), height)); var.tab
msq <- var.tab[[1]][,3];    BMS <- msq[1];    WMS <- msq[2];   m   <- 5
ICC <- (BMS - WMS)/(BMS + (m-1) * WMS); ICC
VIF <- 1 + (m-1) * ICC; VIF

#########################################################################
                           
library(nlme)                     # linear mixed model for clustered data
summary(lme(height ~ gender, data = height, random = ~ 1 | family, method="REML"))
                                 
#########################################################################
                
                              # t-Test (without regarding clustered data)
t.test(height ~ gender, height)

#########################################################################
                             
VIF.ICC <- function(x, m) {           # ICC and VIF from dichotomous data
    n   <- sum(m); k <- length(m)
    mm  <- mean(m)
    m0  <- mm - sum((m - mm)**2) / (k * (k - 1) * mm)
    p <- x/m
    pm  <- sum(x)/sum(m)
    BMS <- sum((x - m * pm)**2 / m) / k
    WMS <- sum( x * (m - x)  / m) / (k * (pm - 1))
    ICC <- (BMS - WMS) / (BMS - (m0 - 1)*WMS)
    VIF <- 1 + (m0-1)*ICC
    cat("ICC = ", round(ICC,4), "sowie VIF = ", round(VIF,4),"\n")   }  

#########################################################################

                                                # example - toxicity data
surv.c <- matrix(c(13,12, 9, 9, 8, 8,12,11, 9, 9, 8,11, 4, 5, 7, 7,
                    0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 1, 2, 3, 3), 
            nrow=2, byrow=TRUE, dimnames = list(c("ja","nein"),1:16))
surv.t <- matrix(c(12,11,10, 9,10, 9, 9, 8, 8, 4, 7, 4, 5, 3, 3, 0,
                    0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 3, 5, 3, 7, 7),
            nrow=2, byrow=TRUE, dimnames = list(c("ja","nein"),1:16))
            
                                            # mean litter lactation index       
LIc <- surv.c[1,]/apply(surv.c, 2, sum); mean(LIc)
LIt <- surv.t[1,]/apply(surv.t, 2, sum); mean(LIt)

                                             
VIF.ICC(surv.c[2,], apply(surv.c, 2, sum))  # lactation index ICC und VIF 
VIF.ICC(surv.t[2,], apply(surv.t, 2, sum))

#########################################################################

                                # transform data from wide to long format              
i.c  <- apply(surv.c, 2, sum); n.c <- sum(i.c)
cdat <- as.data.frame(matrix(rep(0, 4*n.c), nrow=n.c, byrow=TRUE))
colnames(cdat) <- c("group","litter","id","status"); cnt  <- 1
for (i in 1:16) {
    if (surv.c[1,i]>0)
    for (k in 1:surv.c[1,i]) 
            {cdat[cnt,] <- c("control", i, k, 0); cnt <- cnt + 1 }
    if (surv.c[2,i]>0)
    for (k in 1:surv.c[2,i]) 
            {cdat[cnt,] <- c("control", i, k, 1); cnt <- cnt + 1 }  
    }  
i.t  <- apply(surv.t, 2, sum); n.t <- sum(i.t)
tdat <- as.data.frame(matrix(rep(NA, 4*n.t), nrow=n.t, byrow=TRUE))
colnames(tdat) <- c("group","litter","id","status"); cnt  <- 0
for (i in 1:16) {
    if (surv.t[1,i]>0)
    for (k in 1:surv.t[1,i]) 
            {cnt <- cnt + 1; tdat[cnt,] <- c("treatment", i+16, k, 0)}
    if (surv.t[2,i]>0)
    for (k in 1:surv.t[2,i]) 
            {cnt <- cnt + 1; tdat[cnt,] <- c("treatment", i+16, k, 1)}  
    }  

Sdat        <- rbind(cdat, tdat)
Sdat$group <- as.factor(Sdat$group)
Sdat$litter   <- as.numeric(Sdat$litter)
Sdat$id   <- as.numeric(Sdat$id)
Sdat$status <- as.numeric(Sdat$status)

write.table(Sdat, file="toxicity study.csv")

#########################################################################

                                                # simple chi-squared test    
Sdat  <- read.table(file="toxicity study.csv")        
attach(Sdat); tab <- table(group, status); tab

test   <- chisq.test(tab); test

VIF    <- 2.3                              # computation see above (mean)        
p.corr <- pchisq(as.numeric(test$statistic) / VIF, 1, lower.tail=FALSE)
p.corr

#########################################################################

library(MASS)               # generalized linear miced model cluster data
Sdat  <- read.table(file="toxicity study.csv")  
summary(glmmPQL(status ~ group, random = ~ 1 | litter, 
                                     family = binomial, data = Sdat))
                     
#########################################################################

                                                          # Section 8.6.4
                                                        
                       # generalized estimating equations GEE (example 1)
height <- read.csv("clustered data.csv", header = TRUE, dec=",", sep = ";")
height <- height[order(height$family),]                   
library(geepack)
summary(geese(height ~ gender, id = family, data=height, 
                             corstr="exchangeable", family=gaussian))

#########################################################################

                       # generalized estimating equations GEE (example 2)
Sdat  <- read.table(file="teratogen.csv")         
attach(Sdat)
library(geepack)
summary(geese(status ~ group, id = litter, data=Sdat, 
                             corstr="exchangeable", family=binomial))

# mod <- geeglm(Status ~ group, id = litter, data=Sdat, 
#                              corstr="exchangeable", family=binomial)
# res <- residuals(mod)
# pred.probability <- predict(mod, type = "response")  
# pred.logit <- predict(mod, type = "link")

#########################################################################

                                                          # Section 8.7.1

                                                 # survival time analysis
library(survival)
library(muhaz)
                                                   # example chemotherapy
t1 <- c(26,  50,  51,  57,  70,  93, 105, 108, 135, 193,
       229, 241, 242, 263, 455, 489, 518, 566, 582, 595)
z1 <- c( 0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
         0,  0,  1,  1,  0,  0,  1,  0,  1,  1); c1 <- rep(1, length(t1))
t2 <- c( 4,   8,  10,  18,  30,  55,  56,  71, 89, 90, 
       101, 148, 155, 207, 233, 266, 283, 441)       
z2 <- c( 0,  0,  0,  0,  1,  1,  0,  0,  1,  1,
         1,  1,  1,  0,  1,  0,  1,  0);         c2 <- rep(2, length(t2))
group <- c(c1,c2); time <- c(t1,t2); status <- c(z1,z2);
chemo <- data.frame(group, time, status); chemo[1:5,]

                                                             # table 8.11
tab <- summary(survfit(Surv(time, status) ~ group,  type='kaplan-meier',
            conf.type="plain", data=chemo)); tab

#########################################################################

                                                            # figure 8.27
fit <- survfit(Surv(time, status) ~ group,  type='kaplan-meier',
                    conf.type="plain", data=chemo)
par(mfcol=c(1,2),lwd=2, font.axis=2, bty="n", ps=15)
plot(fit[1], bg=0, xlim=c(0,600), ylim=c(0,1), las=1,
                xlab="Time [days]", ylab="S(t)", main=expression(C[1]))
abline(h=0.5, lty=3)
plot(fit[2], xlim=c(0,600), ylim=c(0,1), las=1,
                xlab="Time [days]", ylab="S(t)", main=expression(C[2]))
abline(h=0.5, lty=3)

                                               
library(survival)                              # mean and median survival
fit <- survfit(Surv(time, status) ~ group, data=chemo)
print(fit,  print.rmean=TRUE)

fit$time[min(which(fit$surv<=0.5))]                        # median value

#########################################################################

                                                 # plot cumulative hazard
par(mfrow=c(1,1), lwd=2, font.axis=4, bty="l", ps=15) 
fit <- survfit(Surv(time, status) ~ group, data=chemo)
H   <- -log(fit$surv)
plot(fit, conf.int=FALSE, col="grey", xlim=c(0,600), ylim=c(0,1), las=1, 
      main="Hazard plot", lty=c(1,2), xlab="Time [days]", ylab="S(t)")   
lines(time[group==1], H[1:20], lty=1)
lines(time[group==2], H[21:38], lty=2)

#########################################################################

                                                          # Section 8.7.2
                                                                                                                
library(survival)                                          # logrank test   
fit <- survfit(Surv(time, status) ~ group, data=chemo)

                                                          # figure 8.28 
par(mfcol=c(1,1), lwd=2, font.axis=2, bty="n", ps=14) 
plot(fit, col=c(1,1), xlim=c(0,600), ylim=c(0,1), las=1, mark.time=TRUE,
            lty=c(1,2), xlab="Time [days]", ylab="S(t)")
legend("topright", c("Therapy 1","Therapy 2"),
            bty = "n", lty=c(1,2), col=c(1,1), cex=1)

survdiff(Surv(time, status) ~ group, data=chemo, rho=0)    # logrank test

# survdiff(Surv(time, status) ~ group, data=chemo, rho=1) # Wilcoxon test

#########################################################################

                                                          # Section 8.7.3

                        # parametric regression models for survival times

                                            # example data - chemotherapy
t1 <- c(26,  50,  51,  57,  70,  93, 105, 108, 135, 193,
        229, 241, 242, 263, 455, 489, 518, 566, 582, 595)
z1 <- c( 0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
         0,  0,  1,  1,  0,  0,  1,  0,  1,  1); c1 <- rep(1, length(t1))
t2 <- c( 4,   8,  10,  18,  30,  55,  56,  71, 89, 90,
         101, 148, 155, 207, 233, 266, 283, 441)
z2 <- c( 0,  0,  0,  0,  1,  1,  0,  0,  1,  1,
         1,  1,  1,  0,  1,  0,  1,  0);         c2 <- rep(2, length(t2))
group <- c(c1,c2); time <- c(t1,t2); status <- c(z1,z2);
chemo <- data.frame(group, time, status); chemo[1:10,]

library(survival); attach(chemo)
nc <- length(time)

#########################################################################

                                              
chemo$group <- as.factor(chemo$group)          # exponential distribution
fit1 <- survreg(Surv(time, status) ~ group,      
                data = chemo, dist = "exponential")
print(summary(fit1))
alpha.0 <- fit1$coefficients[1]                        # coefficients
alpha.1 <- fit1$coefficients[2]
lp.1 <- alpha.0 + alpha.1 * 0                          # linear estimator                
lp.2 <- alpha.0 + alpha.1 * 1 
erw.1    <- exp(lp.1); erw.1                           # expected values
erw.2    <- exp(lp.2); erw.2

                                                           
t  <- seq(0,600,by=10)                                      # figure 8.30                   
hazard.1  <- exp(-lp.1);  hazard.2  <- exp(-lp.2)       # hazard function       
par(mfcol=c(1,2), lwd=2, font.axis=1.5, bty="n", ps=15, bty="l")
plot(t, rep(hazard.1, length(t)), type="l",
     main="Hazard function - exponential model",
     xlim=c(0,600), ylim=c(0,0.012), las=1, lty=1,
     xlab="Time [days]", ylab=" ")
lines(t, rep(hazard.2, length(t)), lty=2)
legend("topright", c("Therapy 1","Therapy 2"),
       bty = "n", lty=c(1,2), col=c(1,1), cex=1)
abline(h=seq(0, 0.012, 0.002), col="grey", lty=3)

s.1 <- exp(-hazard.1 * t)                             # survival function
s.2 <- exp(-hazard.2 * t)                 
fitKM <- survfit(Surv(time, status) ~ group, data=chemo)
plot(fitKM, col=c(1,1), xlim=c(0,600), ylim=c(0,1), las=1,
     main="Survival function - exponential model",
     lty=c(1,2), xlab="Time [days]", ylab="S(t)")
abline(h=seq(0, 1, 0.2), col="grey", lty=3)
legend("topright", c("Therapy 1","Therapy 2"),
       bty = "n", lty=c(1,2), col=c(1,1), cex=1)
lines(t, s.1, lty=1)
lines(t, s.2, lty=2)

#########################################################################

                                                         # Gompertz model
library(readxl)                                     # read dat from EXCEL
dtab <- as.data.frame(read_excel("Death rates 2015-17.xlsx"))
males   <- dtab[dtab$gender == 1,]
females <- dtab[dtab$gender == 2,]

par(mfcol=c(1,2), font.axis=2, bty="n", ps=15)              # figure 8.31
plot(males$age, log10(males$dprob), type="l", las=1, col="blue",
     ylab="Log(mortality/100.000)", xlab="Age [years]",
     ylim=c(-5, 0), lwd=2, lty=1)
lines(females$age, log10(females$dprob), col="red", lty=5)
legend("bottomright", c("male","female"), bty="n", 
       lty=c(1,5), col=c("blue","red"))
abline(v=30, col="grey")                             # model (>=30 years)
mage30   <- males$age[males$age >= 30];     
mdpr30   <- males$dprob[males$age >= 30]
fage30   <- females$age[females$age >= 30]; 
fdpr30   <- females$dprob[females$age >= 30]
abline(lm(log10(mdpr30) ~ mage30), col="grey", lty=3, cex=0.3, lwd=2)
abline(lm(log10(fdpr30) ~ fage30), col="grey", lty=3, cex=0.3, lwd=2)
mmodel   <- lm(log(mdpr30) ~ mage30);  
fmodel   <- lm(log(fdpr30) ~ fage30)
mlambda  <- exp(mmodel$coeff[1]);      
flambda  <- exp(fmodel$coeff[1])
malpha   <- mmodel$coeff[2];           
falpha   <- fmodel$coeff[2]
mmht     <- mlambda * exp(malpha*mage30)                 # Hazard males
fmht     <- flambda * exp(falpha*fage30)                 # Hazard females

par(lwd=2, font.axis=2, bty="l", ps=14, lab=c(10,5,2), las=1)
t   <- seq(0, 100, by=1)
mS  <- exp(mlambda/malpha *(1-exp(malpha*t)))
fS  <- exp(flambda/falpha *(1-exp(falpha*t)))
plot(t, mS, type="l", ylab="Survival probability", lty=1,
     xlab="Age [years]", xlim=c(0,100), ylim=c(0,1), col="blue")
lines(t, fS, col="Red", lty=5)
legend("bottomleft", c("male","female"), bty="n", lty=c(1,5), col=c("blue","red"))


#########################################################################

                                                          # Weibull model
                                            # example data - chemotherapy
t1 <- c(26,  50,  51,  57,  70,  93, 105, 108, 135, 193,
        229, 241, 242, 263, 455, 489, 518, 566, 582, 595)
z1 <- c( 0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
         0,  0,  1,  1,  0,  0,  1,  0,  1,  1); c1 <- rep(1, length(t1))
t2 <- c( 4,   8,  10,  18,  30,  55,  56,  71, 89, 90,
         101, 148, 155, 207, 233, 266, 283, 441)
z2 <- c( 0,  0,  0,  0,  1,  1,  0,  0,  1,  1,
         1,  1,  1,  0,  1,  0,  1,  0);         c2 <- rep(2, length(t2))
group <- c(c1,c2); time <- c(t1,t2); status <- c(z1,z2);
chemo <- data.frame(group, time, status); chemo[1:10,]
library(survival); attach(chemo)

chemo$group <- as.factor(chemo$group)
fit2 <- survreg(Surv(time, status) ~ group, data = chemo, dist = "weibull")
print(summary(fit2))
alpha.0 <- fit2$coefficients[1]                      # coefficients
alpha.1 <- fit2$coefficients[2]
scale   <- fit2$scale; shape   <- 1/scale
lp.1 <- alpha.0 + alpha.1 * 0                        # linear estimator                  
lp.2 <- alpha.0 + alpha.1 * 1                   
erw.1    <- exp(lp.1) * gamma(1 + scale); erw.1      # expected values                                                     # Erwartungswerte
erw.2    <- exp(lp.2) * gamma(1 + scale); erw.2

                                                     
t  <- seq(0,600,by=10)                                      # figure 8.32
hazard.1  <- shape * exp(-lp.1)^shape * t^(shape-1)     # hazard function
hazard.2  <- shape * exp(-lp.2)^shape * t^(shape-1)  

par(mfcol=c(1,2), lwd=2, font.axis=1.5, bty="n", ps=15, bty="l")
plot(t, hazard.1, type="l",
     main="Hazard function - Weibull model",
     xlim=c(0,600), ylim=c(0,0.012), las=1, lty=1,
     xlab="Zeit in Tagen", ylab=" ")
lines(t, hazard.2, lty=2)
legend("topright", c("Therapy 1","Therapy 2"),
       bty = "n", lty=c(1,2), col=c(1,1), cex=1)
abline(h=seq(0, 0.012, 0.002), col="grey", lty=3)

s.1 <- exp(-(t*exp(-lp.1))^shape)                     # survival function 
s.2 <- exp(-(t*exp(-lp.2))^shape)              
fitKM <- survfit(Surv(time, status) ~ group, data=chemo)
plot(fitKM, col=c(1,1), xlim=c(0,600), ylim=c(0,1), las=1,
     main="Survival function - Weibull model",
     lty=c(1,2), xlab="Time [days]", ylab="Survival - S(t)")
abline(h=seq(0, 1, 0.2), col="grey", lty=3)
legend("topright", c("Therapy 1","Therapy 2"),
       bty = "n", lty=c(1,2), col=c(1,1), cex=1)
lines(t, s.1, lty=1)
lines(t, s.2, lty=2)

#########################################################################

chemo$group <- as.factor(chemo$group)              # log-logistic model
fit3 <- survreg(Surv(time, status) ~ group,      
                data = chemo, dist = "loglogistic")
print(summary(fit3))
alpha.0 <- fit3$coefficients[1]                  # coefficients
alpha.1 <- fit3$coefficients[2]
scale   <- fit3$scale; shape <- 1/scale
lp.1 <- alpha.0 + alpha.1 * 0                    # linear estimator                 
lp.2 <- alpha.0 + alpha.1 * 1    
erw.1 <- exp(lp.1); erw.1                        # expected values
erw.2 <- exp(lp.2); erw.2

                                                  

t  <- seq(0,600,by=10)                                      # figure 8.33                 
                                                        # hazard function
hazard.1 <- (shape * exp(-lp.1)^shape * t^(shape-1))/(1 + (exp(-lp.1)*t)^shape)
hazard.2 <- (shape * exp(-lp.2)^shape * t^(shape-1))/(1 + (exp(-lp.2)*t)^shape)

par(mfcol=c(1,2), lwd=2, font.axis=1.5, bty="n", ps=15, bty="l")
plot(t, hazard.1, type="l",
     main="Hazard function - log-logistic model",
     xlim=c(0,600), ylim=c(0,0.012), las=1, lty=1,
     xlab="Time [days]", ylab=" ")
lines(t, hazard.2, lty=2)
legend("topright", c("Therapy 1","Therapy 2"),
       bty = "n", lty=c(1,2), col=c(1,1), cex=1)
abline(h=seq(0, 0.012, 0.002), col="grey", lty=3)

s.1 <- 1 / (1 + (exp(-lp.1)*t)^shape)                 # survival function 
s.2 <- 1 / (1 + (exp(-lp.2)*t)^shape)                 

fitKM <- survfit(Surv(time, status) ~ group, data=chemo)
plot(fitKM, col=c(1,1), xlim=c(0,600), ylim=c(0,1), las=1,
     main="Survival function - log-logistic model",
     lty=c(1,2), xlab="Time [days]", ylab="Survival - S(t)")
abline(h=seq(0, 1, 0.2), col="grey", lty=3)
legend("topright", c("Therapy 1","Therapy 2"),
       bty = "n", lty=c(1,2), col=c(1,1), cex=1)
lines(t, s.1, lty=1)
lines(t, s.2, lty=2)

#########################################################################

                                    # model selection and goodness of fit

                                       # example data chemotherapy (s.o.)
t1 <- c(26,  50,  51,  57,  70,  93, 105, 108, 135, 193,
        229, 241, 242, 263, 455, 489, 518, 566, 582, 595)
z1 <- c( 0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
         0,  0,  1,  1,  0,  0,  1,  0,  1,  1); c1 <- rep(1, length(t1))
t2 <- c( 4,   8,  10,  18,  30,  55,  56,  71, 89, 90,
         101, 148, 155, 207, 233, 266, 283, 441)
z2 <- c( 0,  0,  0,  0,  1,  1,  0,  0,  1,  1,
         1,  1,  1,  0,  1,  0,  1,  0);         c2 <- rep(2, length(t2))
group <- c(c1,c2); time <- c(t1,t2); status <- c(z1,z2);
chemo <- data.frame(group, time, status)
chemo$group <- as.factor(chemo$group)

########################################################################
isy <- function(x) {                                 # check for infnity
  for (i in 1:length(x)) if (is.infinite(x[i])) x[i] <- NA
  return(x)        }
########################################################################
library(survival)
########################################################################
fitKM <- survfit(Surv(time, status) ~ group, data=chemo)
n1 <- fitKM$n[1];        n2 <- fitKM$n[2]
t1 <- fitKM$time[1:n1];  t2 <- fitKM$time[(n1+1):(n1+n2)]
s1 <- fitKM$surv[1:n1];  s2 <- fitKM$surv[(n1+1):(n1+n2)]
########################################################################
cat("\n","---> exponential model","\n")
fit1 <- survreg(Surv(time, status) ~ group, 
                data = chemo, dist = "exponential")
cat("\n","---> Weibull model","\n")
fit2 <- survreg(Surv(time, status) ~ group, 
                data = chemo, dist = "weibull")
cat("\n","---> loglogistic model","\n")
fit3 <- survreg(Surv(time, status) ~ group, 
                data = chemo, dist = "loglogistic")
########################################################################

                                                           # figure 8.34
par(mfcol=c(1,3), lwd=2, font.axis=1.8, bty="n", ps=20, bty="l")
                                                 # exponential model
l.1 <- coef(fit1)[1] + coef(fit1)[2]*0            
l.2 <- coef(fit1)[1] + coef(fit1)[2]*1
h.1 <- exp(-l.1)                                 # hazard                             
h.2 <- exp(-l.2)
s.1 <- exp(-exp(-l.1) * t1)                      # survival 
s.2 <- exp(-exp(-l.2) * t2) 

y.1 <- -log(s1); y.2 <- -log(s2)
x.1 <- t1;       x.2 <- t2
plot(x.1, y.1,  type="p", pch=1, cex=1.5, las=1,
     xlim=c(0, 600), ylim=c(0,1), 
     main ="exponential model",
     ylab="-log(S(t))",
     xlab="Time [t]")
abline(a=0, b=h.1, lty=1)
points(x.2, y.2, pch=2, cex=1.5)
abline(a=0, b=h.2, lty=2)
legend("bottomright", c("Therapy 1","Therapy 2"),
       bty = "n", pch=c(1,2), col=c(1,1), cex=1)
                                                 # Weibull model
l.1 <- coef(fit2)[1] + coef(fit2)[2]*0              
l.2 <- coef(fit2)[1] + coef(fit2)[2]*1;  shape <- 1/fit2$scale
h.1 <- exp(-l.1)^shape                           # hazard                             
h.2 <- exp(-l.2)^shape
s.1 <- exp(-(t1*exp(-l.1))^shape)                # survival
s.2 <- exp(-(t2*exp(-l.2))^shape)              

y.1 <- isy(log(-log(s1)));      y.2 <- isy(log(-log(s2)))
x.1 <- log(t1);                 x.2 <- log(t2)
plot(x.1, y.1,  type="p", pch=1, cex=1.5,las=1,
     xlim=c(3, 6.5), ylim=c(-4, 1),
     main ="Weibull model",
     ylab="log(-log(S(t)))",
     xlab="Time [log(t)]")
abline(a=log(h.1), b=shape, lty=1)
points(x.2, y.2, pch=2, cex=1.5)
abline(a=log(h.2), b=shape, lty=2)
legend("bottomright", c("Therapy 1","Therapy 2"),
       bty = "n", pch=c(1,2), col=c(1,1), cex=1)

                                                 # log-logistic model
l.1 <- coef(fit3)[1] + coef(fit3)[2]*0              
l.2 <- coef(fit3)[1] + coef(fit3)[2]*1;   shape <- 1/fit3$scale
h.1 <- exp(-l.1)^shape                           # hazard                             
h.2 <- exp(-l.2)^shape
s.1 <- 1 / (1 + (exp(-l.1)*t1)^shape)            # survival 
s.2 <- 1 / (1 + (exp(-l.2)*t2)^shape)

y.1 <- isy(log((1-s1)/s1));     y.2 <- isy(log((1-s2)/s2))
x.1 <- log(t1);                 x.2 <- log(t2)
plot(x.1, y.1,  type="p", pch=1, cex=1.5,las=1,
     xlim=c(3, 6.5), ylim=c(-4, 1),
     main ="log-logistic model",
     ylab="log((1-S(t))/S(t))",
     xlab="Time [log(t)]")
abline(a=log(h.1), b=shape, lty=1)
points(x.2, y.2, pch=2, cex=1.5)
abline(a=log(h.2), b=shape, lty=2)
legend("bottomright", c("Therapy 1","Therapy 2"),
       bty = "n", pch=c(1,2), col=c(1,1), cex=1)

#########################################################################

                                         # Daten zur Tumortherapie (s.o.)
t1 <- c(26,  50,  51,  57,  70,  93, 105, 108, 135, 193,
        229, 241, 242, 263, 455, 489, 518, 566, 582, 595)
z1 <- c( 0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
         0,  0,  1,  1,  0,  0,  1,  0,  1,  1); c1 <- rep(1, length(t1))
t2 <- c( 4,   8,  10,  18,  30,  55,  56,  71, 89, 90,
         101, 148, 155, 207, 233, 266, 283, 441)
z2 <- c( 0,  0,  0,  0,  1,  1,  0,  0,  1,  1,
         1,  1,  1,  0,  1,  0,  1,  0);         c2 <- rep(2, length(t2))
group <- c(c1,c2); time <- c(t1,t2); status <- c(z1,z2);
chemo <- data.frame(group, time, status)
chemo$group <- as.factor(chemo$group)
nc <- length(chemo$time)
                                                      # exponential model
L0  <- fit1$loglik[1]; L <- fit1$loglik[2]                  # likelihood
G1  <- 2*(L - L0); p1 <- pchisq(G1, 1, lower.tail=FALSE)    # G-statistic
RC1 <-  1-(exp(L0)/exp(L))^(2/nc)                           # Cox-Snell
RN1 <- RC1 / (1-(exp(L0)^(2/nc)))                           # Nagelkerke 
                                                      # Weibull model
L0  <- fit2$loglik[1]; L <- fit2$loglik[2]                  # likelihood
G2  <- 2*(L - L0); p2 <- pchisq(G2, 1, lower.tail=FALSE)    # G-statistic
RC2 <-  1-(exp(L0)/exp(L))^(2/nc);                          # Cox-Snell
RN2 <- RC2 / (1-(exp(L0)^(2/nc)))                           # Nagelkerke 
                                                      # log-logistic model
L0  <- fit3$loglik[1]; L <- fit3$loglik[2]                  # likelihood
G3  <- 2*(L - L0); p3 <- pchisq(G3, 1, lower.tail=FALSE)    # G-statistic
RC3 <-  1-(exp(L0)/exp(L))^(2/nc)                           # Cox-Snell
RN3 <- RC3 / (1-(exp(L0)^(2/nc)))                           # Nagelkerke  

tab  <- matrix(data = NA, nrow = 3, ncol = 3, byrow = FALSE,
               dimnames = NULL)
tab[1,1] <- round(G1, 4); tab[1,2] <-  round(p1, 4); tab[1,3] <-  round(RN1, 4) 
tab[2,1] <- round(G2, 4); tab[2,2] <-  round(p2, 4); tab[2,3] <-  round(RN2, 4)
tab[3,1] <- round(G3, 4); tab[3,2] <-  round(p3, 4); tab[3,3] <-  round(RN3, 4)
tab

#library(xtable)
#xtable(tab, digits=4)

#########################################################################

                                               # accelerated failure time

t      <- seq(0, 50, 2)                                     # figure 8.35
hazard <- 0.05
par(mfcol=c(1,1), lwd=2, font.axis=1.5, bty="n", ps=15, bty="l")
surv1   <- exp(-hazard*t)
plot(t, surv1, type="l", las=1,
     xlab="Survival time  t", ylab="Survival probability S(t)")
surv2   <- exp(-hazard*2*t)
lines(t,surv2)
text(25, 0.6, expression(paste(plain("Acceleration "), gamma == 2)), cex=1.5)
text(25, 0.5, expression(S[1](t) == S[2](2 * t)), cex=1.5)
text(10, 0.7, expression(S[1](t)), cex=1.2)
text(1.8, 0.7, expression(S[2](t)), cex=1.2)
lines(t[1:3], rep(0.8, 3), lty=2)
lines(t[1:6], rep(0.6, 6), lty=2)
lines(t[1:10], rep(0.4, 10), lty=2)
lines(t[1:16], rep(0.2, 16), lty=2)


#########################################################################

                                                          # Section 8.7.4

                                         # Cox proportional hazards model
library(survival)
data(ovarian)
attach(ovarian)
#summary(survfit( Surv(futime, fustat)~1 , data=ovarian))
fit <- coxph(Surv(futime, fustat) ~ age + rx + resid.ds + ecog.ps, 
                        data=ovarian)
summary(fit)

#########################################################################
                                             
library(survival)                                    # model - prediction
data(ovarian)
fit <- coxph(Surv(futime,fustat) ~ age + rx, ovarian)
fit

                                                           # figure 8.36
par(mfcol=c(1,1),lwd=2, font.axis=2, bty="l", ps=15)
plot( survfit(fit), conf.int=FALSE, lty=2, las=1,
        xlim=c(0,700), xlab="Time [days]", lwd=2,
        ylab="Probability")
lines(survfit(fit, newdata=data.frame(age=40, rx=2)), 
       conf.int=FALSE, col="blue", lwd=3)
lines(survfit(fit, newdata=data.frame(age=60, rx=1)),
       conf.int=FALSE, col="red", lwd=3)
legend(420, 0.93, "Age 40 years, therapy 2", bty="n", cex=1.3)
legend(170, 0.40, "Age 60 years, therapy 1", bty="n", cex=1.3)

#########################################################################

                               # nomogram from linear predictor (scoring)
library(rms); data(ovarian)
df <- ovarian
df$rx       <- factor(df$rx, levels=c(1,2), labels=c("group A","group B"))
names(df)[5] <- "treatment"
d <- datadist(df); options(datadist="d")        
fit <- cph(Surv(futime,fustat) ~ age + treatment, surv=T, data = df)
surv   <- Survival(fit)
surv1  <- function(lp) surv(365, lp)                # one year survival
at.surv <- c(0.05, 0.25, 0.5, 0.75, 0.95, 0.98)
nom <- nomogram(fit, conf.int=F, fun=list(surv1),
                funlabel=c('1-year survival probability'), lp=F,
                fun.at=c(at.surv))
par(cex=1.2); plot(nom, lwd=2.5)

#########################################################################

                                        # selecting influencing variables
fitm <- coxph(Surv(futime, fustat) ~ age + rx + resid.ds + ecog.ps, ovarian)
fitm$loglik[1]; fitm$loglik[2]

fit1 <- update(fitm, . ~ . -ecog.ps)
gm   <- 2*(fitm$loglik[2]-fitm$loglik[1]); gm; 
pchisq(gm, 4, lower.tail=FALSE)
g1   <- 2*(fit1$loglik[2]-fit1$loglik[1]); g1; 
pchisq(g1, 3, lower.tail=FALSE)
fit2 <- update(fit1, . ~ . - resid.ds)
g2   <- 2*(fit2$loglik[2]-fit2$loglik[1]); g2; 
pchisq(g2, 2, lower.tail=FALSE)
fit3 <- update(fit2, . ~ . - age)
g3   <- 2*(fit3$loglik[2]-fit3$loglik[1]); g3; 
pchisq(g3, 2, lower.tail=FALSE)

#########################################################################

                                                               # step AIC
library(MASS)
fit  <- coxph(Surv(futime, fustat) ~ age + rx + resid.ds + ecog.ps, data=ovarian) 
summary(stepAIC(fit, upper = ~ age + rx + resid.ds + ecog.ps,trace=TRUE))

#########################################################################

library(survival)                   # quality of prediction - concordance
data(ovarian)
fit <- coxph(Surv(futime,fustat) ~ age + rx, ovarian)
concordance(fit)

#########################################################################

                                                    # Cox-Snell residuals      
library(survival)
data(ovarian)
fit0 <- coxph(Surv(futime, fustat) ~ 1, ovarian)
fitm <- coxph(Surv(futime, fustat) ~ age + rx + resid.ds + ecog.ps, ovarian)
m.resid  <- resid(fitm)
cs.resid <- ovarian$fustat - m.resid
km.cs    <- survfit(Surv(cs.resid, ovarian$fustat)~1)
cs.times <- km.cs$time
cs.S     <- km.cs$surv
cs.exp   <- -log(cs.S)
                                                            # figure 8.38         
par(mfcol=c(1,1),lwd=2, font.axis=2, bty="n", ps=15)
plot(cs.times, cs.exp, type="b", xlab="Cox-Snell residuals",
               las=1, ylab="Cumulative risk")
abline(0, 1, lty=2)    

#########################################################################

                                                   # martingale residuals

                                                            # figure 8.39    
par(mfcol=c(1,2),lwd=2, font.axis=2, bty="n", ps=15)
fit0 <- coxph(Surv(futime, fustat) ~ 1, ovarian)  
scatter.smooth(ovarian$age, resid(fit0), las=1,
               xlab="Age [years]", ylim=c(-1,1),
ylab="Martingale residuals (null model)"); abline(h=0, lty=2)  
scatter.smooth(ovarian$rx, resid(fit0), las=1,
               xlab="Residual disease", ylim=c(-1,1),
ylab="Martingale residuals (null model)"); abline(h=0, lty=2)     

#########################################################################

                                                   # Schoenfeld residuals
fit.age <- coxph(Surv(futime, fustat)~age, data=ovarian)
detail <-  coxph.detail(fit.age)
time   <- detail$y[,2]
stat   <- detail$y[,3]
res    <- resid(fit.age, type="schoenfeld")
                                                    # Abbildung 8.30     
par(mfcol=c(1,1),lwd=2, font.axis=2, bty="n", ps=14)
scatter.smooth(time[stat==1], res, ylim=c(-20,10), las=1,
              xlab="Time [days]", ylab="Schoenfeld residuals to age")   
abline(h=0, lty=2)

#########################################################################


